In [1]:
# Data manipulation
import pandas as pd
import numpy as np
from numpy import linalg



# Support libs
import glob
import json
import datetime
from scipy import stats
import os

import requests
import subprocess

import re
import random

# Plotting
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import seaborn as sns

# Statistical tools
from scipy.spatial.distance import squareform, pdist
from scipy import stats
from scipy.stats import gamma, pearsonr
from scipy import signal as sp_signal

from sklearn import ensemble
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import statsmodels.api as sm

### Zip Code database API
from uszipcode import ZipcodeSearchEngine

# Display full column width
pd.set_option('display.max_colwidth', -1)
from IPython.display import display, HTML
/Users/ross/anaconda/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
In [2]:
import plotly
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
import colorlover as clv

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.offline.offline import _plot_html

import cufflinks as cf
plotly.offline.init_notebook_mode() # run at the start of every notebook

Helper Functions

In [3]:
def displayHTML(in_df):
    """Render a DataFrame as a rich HTML table in the notebook output area."""
    html_table = HTML(in_df.to_html())
    display(html_table)
In [4]:
def Lowess(data, f=2. / 3., pts=None, itn=3):
    """Lowess(s, f=2./3., pts=None, itn=3) -> yEst
    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.

    data:   (pandas.Series) contains data points in the scatterplot. The
            function returns the estimated (smooth) values of y.
    f:      (float) the fraction of the data set to use for smoothing. A
            larger value for f will result in a smoother curve.
    pts:    (int) Optionally, the explicit number of data points to be used for
            smoothing can be passed through pts instead of f.
    itn:    (int) The number of robustifying iterations. The function will run
            faster with a smaller number of iterations.
    yEst:   (pandas.Series) The return value containing the smoothed data.
    """
    # Authors: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
    #          Dan Neuman (conversion to Pandas series)
    # License: BSD (3-clause)

    x = np.array(data.index)
    y = data.values
    n = len(data)
    if pts is None:
        f = np.min([f, 1.0])
        r = int(np.ceil(f * n))
    else:  # allow use of number of points to determine smoothing
        r = int(np.min([pts, n]))
    # Clamp to the largest valid index: with f=1.0 (or pts=n) r equals n,
    # and indexing the sorted distance array at [r] would raise IndexError.
    r = min(r, n - 1)
    # h[i] = distance to the r-th nearest neighbour of x[i] (local bandwidth)
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    # Tricube weight matrix over scaled pairwise distances.
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3
    yEst = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(itn):
        # Weighted linear least-squares fit at each point.
        for i in range(n):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = linalg.solve(A, b)
            yEst[i] = beta[0] + beta[1] * x[i]

        residuals = y - yEst
        s = np.median(np.abs(residuals))
        if s == 0:
            # Perfect fit: the robustifying weights below would divide by
            # zero and fill delta with NaNs, corrupting later iterations.
            break
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2
    return pd.Series(yEst, index=data.index, name='Trend')
In [5]:
def non_persistent_ts(in_df, window=4, ref_year=2016, threshold=0.2):
    """Identify time-series columns whose measurements do not persist.

    Flags every length-`window` run of consecutive nulls in each column,
    computes the yearly rate of such runs, and marks for dropping any
    column whose rate in `ref_year` exceeds `threshold`.

    in_df:     (pandas.DataFrame) wide frame indexed by 'local_datetime',
               one column per geo cluster (columns index named 'geo_cluster').
    window:    (int) run length of consecutive nulls that counts as a gap.
    ref_year:  (int) year whose gap rate drives the drop decision.
    threshold: (float) maximum tolerated gap rate for ref_year.

    Returns (keep_list, drop_list, window_null_rates).
    """
    # NOTE: this previously read the module-level `unstacked_df` instead of
    # the `in_df` parameter, silently ignoring its argument.
    # 1 where the trailing `window` observations are all null, else 0.
    null_runs = in_df.isnull().rolling(window).sum()
    flags_df = null_runs.applymap(lambda x: 1 if x == window else 0).reset_index()

    flags_df["year"] = flags_df["local_datetime"].map(lambda x: x.year)

    # Restrict the aggregations to the cluster columns so the datetime
    # column cannot break (or silently reorder) the groupby results.
    cluster_cols = [c for c in flags_df.columns if c not in ("local_datetime", "year")]
    grouped = flags_df.groupby("year")[cluster_cols]
    year_ct = grouped.count()
    year_sum = grouped.sum()

    # Rows = geo clusters, columns = years, values = gap rate.
    window_null_rates = (year_sum / year_ct).T.reset_index()

    drop_list = list(window_null_rates[window_null_rates[ref_year] > threshold]["geo_cluster"])
    keep_list = [c for c in in_df.columns if c not in drop_list]

    return keep_list, drop_list, window_null_rates
In [6]:
def treat_ts( in_df, keep_list ):
    # drop the bad columns
    in_df = in_df[keep_list]

    # interpolate the missing records
    return in_df.loc[:,0:].apply(lambda x: x.interpolate(method='spline', order=3))
In [7]:
def smoother(in_df):
    """Apply the Lowess smoother column-wise and re-attach the datetime index.

    in_df: (pandas.DataFrame) wide frame indexed by 'local_datetime', one
           column per geo cluster.
    Returns a DataFrame of smoothed trends with the same index and columns.
    """
    # Lowess needs a numeric x-axis, so smooth against positional order:
    # drop the datetime index, zero-fill gaps, then smooth each column.
    lowess_fit = lambda col: Lowess(col, f=2. / 3., pts=None, itn=3)

    smoother_df = in_df.reset_index(level=[0], drop=True).fillna(0).copy()
    smoother_df = smoother_df.apply(lowess_fit, axis=0)

    # Restore the original datetime index. (This previously read the
    # module-level `unstacked_df`; use the parameter so the function is
    # self-contained and works on any input frame.)
    smoother_df["local_datetime"] = in_df.index
    smoother_df = smoother_df.set_index("local_datetime")

    return smoother_df
In [8]:
def detrend(smoother_df, unstacked_df):
    """Subtract the smoothed trend from the raw series.

    smoother_df:  (pandas.DataFrame) Lowess trends; gains a
                  'local_datetime' column as a side effect.
    unstacked_df: (pandas.DataFrame) raw cluster series, datetime-indexed.
    Returns (residuals_df, sq_residuals_df): the residuals and their squares.
    """
    # Align the trend frame on the same datetime index as the raw data.
    smoother_df["local_datetime"] = unstacked_df.index
    trend_df = smoother_df.set_index("local_datetime")

    residuals_df = unstacked_df - trend_df
    sq_residuals_df = residuals_df ** 2
    return residuals_df, sq_residuals_df
In [9]:
def cluster_ts(input_series):
    """Cluster a 1-D series of values with DBSCAN and return the labels.

    input_series: (pandas.Series) values to cluster; nulls are zero-filled
                  before fitting.
    Returns db.labels_: one integer cluster label per row (-1 marks noise).
    """
    X = pd.DataFrame(input_series.fillna(0))

    # Compute DBSCAN
    db = DBSCAN(eps=.9,
                min_samples=4,
                leaf_size=30,
                metric="euclidean").fit(X)

    # NOTE: a core-sample mask was previously computed here but never used
    # or returned; it has been removed. Callers consume only the labels.
    return db.labels_

Pull Data from S3

In [10]:
# Input Params
wd = "./../assets/data/aws-landing/"
start = "2016-01-01"
end = "2018-04-01"

# Gen Date Range
timeWindow = pd.date_range(start, end, freq=pd.tseries.offsets.DateOffset(days=1))
In [11]:
timeWindow
Out[11]:
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
               '2016-01-09', '2016-01-10',
               ...
               '2018-03-23', '2018-03-24', '2018-03-25', '2018-03-26',
               '2018-03-27', '2018-03-28', '2018-03-29', '2018-03-30',
               '2018-03-31', '2018-04-01'],
              dtype='datetime64[ns]', length=822, freq='<DateOffset: kwds={'days': 1}>')
request_log = {} # Loop Through Date Range for date in timeWindow: try: snap_date = str(date)[:10] # Format String for BASh Command bashCommand = """aws s3 cp s3://openaq-data/{0}.csv {1}{0}.csv""".format( snap_date, wd ) # Execute AWS CLI Command process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) output, error = process.communicate() # Log Outputs in a Dict, by Date request_log[snap_date] = [output, error] except: pass# request_log["2018-01-01"]
In [12]:
! ls ../assets/data/aws-landing/

Get a Feel for What's Going on with Cross Sectional Data

# Step through and concat three months at a time, by stepping through a listpath = r'./../assets/data/concat-data/' # use your path all_files = glob.glob(os.path.join(path, "*.csv")) # advisable to use os.path.join as this makes concatenation OS independent df_from_each_file = (pd.read_csv(f) for f in all_files) concatenated_df = pd.concat(df_from_each_file, ignore_index=True) concatenated_df = concatenated_df[concatenated_df["country"]=="US"].drop(["attribution","unit","utc"], axis=1)print concatenated_df["local"].min() print concatenated_df["local"].max() print concatenated_df.info() concatenated_df.head()concatenated_df.to_csv("./../assets/data/concat-data/2016-2018.csv", index=None)

Read the US DataSet

In [13]:
us_data = pd.read_csv("./../assets/data/concat-data/2016-2018.csv")
In [14]:
us_data.head(2)
Out[14]:
location city country local parameter value latitude longitude
0 MMFRA1001 FRA US 2018-02-04T03:00:00-08:00 pm25 21.000 39.075837 -121.634166
1 Sinclair In-Town CARBON US 2018-02-04T04:00:00-07:00 so2 -0.001 41.782370 -107.120830
In [15]:
len(us_data["location"].unique())
Out[15]:
1954

Filter the Dataset to pm2.5 Measurements Only

In [16]:
us_data_pm25_df = us_data[us_data["parameter"]=="pm25"].copy()
In [17]:
print len(us_data_pm25_df["location"].unique())
us_data_pm25_df.head(3)
983
Out[17]:
location city country local parameter value latitude longitude
0 MMFRA1001 FRA US 2018-02-04T03:00:00-08:00 pm25 21.0 39.075837 -121.634166
5 Bayamon 37 (NCore) BAYAMON US 2018-02-04T07:00:00-04:00 pm25 8.3 18.419230 -66.150420
8 Murphy Ridge Evanston US 2018-02-04T04:00:00-07:00 pm25 20.2 41.369400 -111.041900

Datetime Calculations

In [18]:
print """Dates are available at local time and UTC, which
         normalizes for the day; however this also means that
         a single file will have data that spans over two days,
         due to the time zones. \n""" 
print "This is the local time: {} \n".format(us_data_pm25_df["local"][0])
# print "This is the utc (universaly coordinated time): {} \n".format(us_data["utc"][0])

print "The data are formatted as string types"
print type(us_data_pm25_df["local"][0])
Dates are available at local time and UTC, which
         normalizes for the day; however this also means that
         a single file will have data that spans over two days,
         due to the time zones. 

This is the local time: 2018-02-04T03:00:00-08:00 

The data are formatted as string types
<type 'str'>
In [19]:
# pd.DataFrame(date_map_df).head(2)

Performing Date Calculations on a Map DF for Performance

In [20]:
date_map_df = pd.DataFrame(us_data_pm25_df["local"].drop_duplicates())

# creating a datetime format, for SARIMAX later
date_map_df["local_datetime"] = pd.to_datetime(date_map_df["local"])

# extracting the time and day
date_map_df["local_hour"] = date_map_df["local_datetime"].dt.hour
date_map_df["local_day"] = date_map_df["local_datetime"].dt.day

Merge Date Calculations in

In [21]:
us_data_pm25_df = us_data_pm25_df.merge( date_map_df, on="local", how="left" )

Look at Periodicity / Cardinality

In [22]:
print """ Checking the distribution of reading times to sample times 
            that all locations have in common. \n
            We want to normalize for factors like rush hour. \n
            It may be useful to think about the latency in terms of 
            emissions affects on the readings. \n"""

reading_time_cts = us_data_pm25_df[["local_hour","location"]].groupby("local_hour")\
                            .count()\
                            .rename(columns={"location":"count"}).reset_index()\
                            .sort_values(by="local_hour")
 Checking the distribution of reading times to sample times 
            that all locations have in common. 

            We want to normalize for factors like rush hour. 

            It may be useful to think about the latency in terms of 
            emissions affects on the readings. 

In [23]:
df = reading_time_cts
fig = {
    'data' : [{
        'type' : 'bar',
        'y' : df["count"].tolist(),
        'x' : df.index.tolist(),
        'orientation' : 'v'
        
    }],
    'layout' : {
        "title": "Count of Readings by Hour"
        , 'xaxis': {'title': 'Hour of Day'}
        , 'yaxis': {'title': "Count"}
    }
}

iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
In [24]:
print """There are {} unique locations.

        We are going to find a time of day that has this number of unique locations.
        
        We will need to think about whether or not the times coincide with metrics, 
        if we are to look at multivariate distributions.
        
        If we can't find an exact time, then we might have to choose bands.
        """.format(len(us_data_pm25_df["location"].unique()))
There are 983 unique locations.

        We are going to find a time of day that has this number of unique locations.
        
        We will need to think about whether or not the times coincide with metrics, 
        if we are to look at multivariate distributions.
        
        If we can't find an exact time, then we might have to choose bands.
        

Resample the Time Series, so that Cities with More Measurements Don't Create Bias

In [25]:
# TODO: Change this to two hour windows and take the max
us_data_pm25_df = us_data_pm25_df[us_data_pm25_df["local_hour"].isin([10,18])]
In [26]:
# us_data_pm25_df.to_csv("../assets/data/concat-data/2016-2018-us-pm25.csv")

# us_data_pm25_df = pd.read_csv("../assets/data/concat-data/2016-2018-us-pm25.csv")

Define the Networks

Clustering Cities on Distance with k-means

  • We're using k-means here because it produces spheroidal shapes which we will transform into networks to understand how cities impact each other and covary with respect to pm2.5 levels

  • Another way to think about this is that we are performing a variation of hierarchical clustering, except with two steps that only consider the existing centroid

Let:

  1. $c_i \in \{ 1, 2, \ ... \ , k \}$ represent k clusters
  2. $\boldsymbol \mu_k$ represent cluster centroids
  3. $\mu_{c_i}$ represent assignment

We will use this cost function:

$J(c_1, \ ... \ , c_m, \boldsymbol \mu_1, \ ... \ , \boldsymbol \mu_k) = \cfrac{1}{m} \sum_i \left\| \mathbf x_i - \boldsymbol \mu_{c_i} \right\|^2$

  • We will optimize through the minimization $\min J(c_1, \ ... \ , c_m, \boldsymbol \mu_1, \ ... \ , \boldsymbol \mu_k)$ with respect to $c_1, \ ... \ , c_m, \boldsymbol \mu_1, \ ... \ , \boldsymbol \mu_k$
In [27]:
# Create a DataFrame of Unique Location Coordinates of ALL sensors in the US
coords_df = us_data_pm25_df[['location','latitude', 'longitude']].drop_duplicates()\
                                                            .reset_index()\
                                                            .drop("index",axis=1)
# Prep the Coordinate DF for a distance matrix
coords = coords_df.as_matrix(columns=['latitude', 'longitude'])
In [28]:
print len(coords)
print len(coords_df)
1343
1343

Plot Unique Sensors to Understand the Frequency

In [29]:
cf.set_config_file(offline=True, world_readable=True, theme='pearl')

df = coords_df[\
                (coords_df["longitude"].between(-145, -60))\
               &(coords_df["latitude"].between(20, 55))]\
                [["latitude","longitude"]]
    
df["geo_cluster"] = 1

fig = {
    'data': [
        {
            'x': df[df['geo_cluster']==geo_cluster]['longitude'],
            'y': df[df['geo_cluster']==geo_cluster]['latitude'],
            'name': geo_cluster, 'mode': 'markers',
        } for geo_cluster in df["geo_cluster"].unique()
    ],
    'layout': {
        "title": "Air Quality Sensors in the US"
        , 'xaxis': {'title': 'Longitude'}
        , 'yaxis': {'title': "Latitude"}
    }
}

plotly.offline.iplot(fig)
In [30]:
# create feature vectors to train a kmeans model
locations = coords_df['location'].values
f1 = coords_df['latitude'].values
f2 = coords_df['longitude'].values
X = np.array(list(zip(f1, f2)))
In [31]:
# Looping through silhouette scores to find an optimal compactness and isolation
cluster_range = range( 30, 85 )
cluster_errors = []
cluster_sil_means = []

for num_clusters in cluster_range:
    clusters = KMeans( num_clusters, random_state=0 )
    clusters.fit( X )
    
    cluster_errors.append( clusters.inertia_ )
    
    cluster_labels = clusters.fit_predict( X )

    silhouette_avg = silhouette_score( X, cluster_labels )
    cluster_sil_means.append( silhouette_avg )
    
clusters_df = pd.DataFrame( {\
                               "num_clusters":cluster_range\
                             , "cluster_errors": cluster_errors
                             , "mean_sil": cluster_sil_means } )
In [32]:
plotly.offline.iplot({
"data": [{
    "x": clusters_df.num_clusters,
    "y": clusters_df.cluster_errors
}],
"layout": {
    "title": "Inertia by Number of Geo Clusters"
    , "xaxis": {"title":"K Clusters"}
    , "yaxis": {"title":"SSE"}
}
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)

plotly.offline.iplot({
"data": [{
    "x": clusters_df.num_clusters,
    "y": clusters_df.mean_sil
}],
"layout": {
    "title": "Mean Silhouette Score by Number of Geo Clusters"
    , "xaxis": {"title":"K Clusters"}
    , "yaxis": {"title":"Silhoutte Score"}
}
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
In [33]:
sil_info = clusters_df[clusters_df.mean_sil==clusters_df.mean_sil.max()]
k = sil_info.num_clusters.values[0]

print "At k={0} the silhouette score is maximized at {1}".format(\
                                                                   k\
                                                                 , sil_info.mean_sil.values[0] )
At k=72 the silhouette score is maximized at 0.565156528546
In [34]:
# Number of clusters
kmeans = KMeans(n_clusters=k, random_state=0)
# Fitting the input data
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
In [35]:
# centroids

Label the Clusters

In [36]:
coords_df["geo_cluster"] = labels
In [37]:
# group the cluster counts and plot their frequencies
cluster_counts = coords_df[["geo_cluster","location"]].groupby("geo_cluster")\
                                        .count()\
                                        .rename(columns={"location":"count"})\
                                        .sort_values(by="count", ascending=False)\
                                        .reset_index()
In [38]:
df = cluster_counts
fig = {
    'data' : [{
        'type' : 'bar',
        'y' : df["count"].tolist(),
        'x' : df.index.tolist(),
        'orientation' : 'v'
        
    }],
    'layout' : {
        "title": "Count of Sensors by Cluster"
        , 'xaxis': {'title': 'Cluster Label'}
        , 'yaxis': {'title': "Count"}
    }
}

iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
In [39]:
cmap = []

for i in range(len(coords_df["geo_cluster"].unique())/6):
    cmap.append([j for j in range(2,8)]) 
    
flattened_cmap = [y for x in cmap for y in x]

color_dict = dict(zip(coords_df["geo_cluster"].unique(),flattened_cmap))
In [40]:
cf.set_config_file(offline=True, world_readable=True, theme='pearl')

bupu = clv.scales['9']['seq']['PuBu']

df = coords_df[\
                (coords_df["longitude"].between(-145, -60))\
               &(coords_df["latitude"].between(20, 55))]\
                [["latitude","longitude","geo_cluster"]]
fig = {
    'data': [
        {
              'x': df[df['geo_cluster']==geo_cluster]['longitude']
            , 'y': df[df['geo_cluster']==geo_cluster]['latitude']
            , 'name': geo_cluster, 'mode': 'line'
            , "marker":dict( color=bupu[color_dict[geo_cluster]])
        } for geo_cluster in df["geo_cluster"].unique()
                

        
    ],
    'layout': {
        "title": "Sensors in the US by Geospatial Cluster"
        , 'xaxis': {'title': 'Longitude'}
        , 'yaxis': {'title': "Latitude"}
        , "shapes": [
                    {
            'type': 'circle',
            'xref': 'x',
            'yref': 'y',
            'x0': min(df[df['geo_cluster']==geo_cluster]['longitude']),
            'y0': min(df[df['geo_cluster']==geo_cluster]['latitude']),
            'x1': max(df[df['geo_cluster']==geo_cluster]['longitude']),
            'y1': max(df[df['geo_cluster']==geo_cluster]['latitude']),
            'opacity': 0.2,
            'fillcolor': bupu[color_dict[geo_cluster]],
            "line":dict( color=bupu[color_dict[geo_cluster]])
        } for geo_cluster in df["geo_cluster"].unique()        
        ]

    }    
}

plotly.offline.iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)

Calculate the Closest Neighbors

In [41]:
# Merge Geo Clusters in on lat lng for precision
working_df = us_data_pm25_df.merge(\
                                     coords_df[["latitude","longitude","location","geo_cluster"]]\
                                   , on=["latitude","longitude","location"], how="left" )
working_df.head(3)
Out[41]:
location city country local parameter value latitude longitude local_datetime local_hour local_day geo_cluster
0 MMFRA1001 FRA US 2018-02-04T10:00:00-08:00 pm25 -3.0 39.07573 -121.634016 2018-02-04 18:00:00 18 4 58
1 Bayamon 37 (NCore) BAYAMON US 2018-02-04T14:00:00-04:00 pm25 9.4 18.41923 -66.150420 2018-02-04 18:00:00 18 4 13
2 Murphy Ridge Evanston US 2018-02-04T11:00:00-07:00 pm25 20.4 41.36940 -111.041900 2018-02-04 18:00:00 18 4 15
In [42]:
"{} pct null labels".format(\
                              len(working_df[working_df["geo_cluster"].isnull()])\
                              /(len(working_df)*1.00))
Out[42]:
'0.0 pct null labels'

Compute Parent Clusters

In [43]:
# Calculate the mid point
midpoints_df = working_df[[\
                             "geo_cluster"\
                           , "latitude"\
                           , "longitude"\
                          ]].groupby("geo_cluster")\
                            .mean()\
                            .reset_index()

# Calculate a distance matrix
dist_matrix = pd.DataFrame(\
                             squareform(pdist( midpoints_df.iloc[:, 1:] ))\
                           , columns= midpoints_df.geo_cluster.unique()\
                           , index= midpoints_df.geo_cluster.unique() )

# 1 degree is about 111km
dist_matrix_km = dist_matrix*111
In [ ]:
 
In [44]:
def flag_neighbor(x):
    """Mark a distance (km) as a neighbor link.

    Returns 1 when the truncated distance is nonzero (i.e. not the cluster
    itself on the matrix diagonal) and at most 400 km; otherwise 0.
    """
    km = int(x)
    return 1 if km != 0 and km <= 400 else 0


dist_matrix_dummies = dist_matrix_km.applymap(flag_neighbor)\
                                    .reset_index()\
                                    .rename(columns={"index":"geo_cluster"})
In [45]:
print "Now we have a list of all of the clusters as well as the distance matrix transformed:"
print dist_matrix_dummies["geo_cluster"].unique()

# hot encoding for neighbors
dist_matrix_dummies.head()
Now we have a list of all of the clusters as well as the distance matrix transformed:
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
Out[45]:
geo_cluster 0 1 2 3 4 5 6 7 8 ... 62 63 64 65 66 67 68 69 70 71
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
2 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 73 columns

In [46]:
# Create a dictionary that contains a list of neighbors for each cluster
# We can pass this in as a filter to make it easier to map model fit logic later
neighbor_dict = {}

# Loop through the unique clusters
for cluster in dist_matrix_dummies["geo_cluster"].unique():
    # Transpose the row into a column vector
    neighbor_flags = dist_matrix_dummies[dist_matrix_dummies["geo_cluster"]==cluster].iloc[:, 1:].T
    neighbor_flags = neighbor_flags.rename(\
                                           columns={\
                                                      neighbor_flags.columns.values[0]:"neighbor_ind"\
                                                    })\

    
    # When there is a neighbor flag, take the index and store it in a dictionary
    neighbor_list = list(neighbor_flags[neighbor_flags["neighbor_ind"]==1].index)
    neighbor_dict[cluster] = neighbor_list
In [47]:
neighbor_dict
Out[47]:
{0: [48],
 1: [16, 20, 50, 57, 58, 62],
 2: [],
 3: [22, 25, 59],
 4: [11, 23, 42],
 5: [54],
 6: [53],
 7: [],
 8: [38, 47],
 9: [45],
 10: [27, 32, 66],
 11: [4, 26, 42, 44],
 12: [29, 63],
 13: [],
 14: [31],
 15: [],
 16: [1, 24, 50, 51, 57, 62],
 17: [31, 56, 71],
 18: [33, 61],
 19: [28],
 20: [1, 50, 58, 62],
 21: [],
 22: [3, 43, 59],
 23: [4],
 24: [16, 41, 50, 57, 62],
 25: [3],
 26: [11, 44, 46],
 27: [10, 35, 65],
 28: [19, 54, 60],
 29: [12, 43, 55],
 30: [],
 31: [14, 17, 36],
 32: [10, 66],
 33: [18, 61, 66],
 34: [],
 35: [27, 65],
 36: [31],
 37: [],
 38: [8, 47, 49],
 39: [48],
 40: [],
 41: [24, 51, 57],
 42: [4, 11],
 43: [22, 29, 55],
 44: [11, 26],
 45: [9, 70],
 46: [26],
 47: [8, 38, 60],
 48: [0, 39],
 49: [38],
 50: [1, 16, 20, 24, 57, 58, 62],
 51: [16, 41, 57],
 52: [],
 53: [6],
 54: [5, 28],
 55: [29, 43],
 56: [17],
 57: [1, 16, 24, 41, 50, 51, 62],
 58: [1, 20, 50, 62],
 59: [3, 22],
 60: [28, 47],
 61: [18, 33, 66],
 62: [1, 16, 20, 24, 50, 57, 58],
 63: [12],
 64: [],
 65: [27, 35],
 66: [10, 32, 33, 61],
 67: [],
 68: [],
 69: [],
 70: [45],
 71: [17]}

Filter to a Single Geo Cluster, at a Single Time (the AM for now) to Test a GMM

In [48]:
# pick a single cluster
target = 50

# fetch the neighbors of that cluster 
target_neighbors = neighbor_dict[target]

# create a dataframe that contains a label for neighbors of that cluster 
filterd_df = working_df[['geo_cluster','latitude', 'longitude']].drop_duplicates().copy()

filterd_df["filter_cluster"] = "Not in Network"
filterd_df.loc[(filterd_df["geo_cluster"]==target),"filter_cluster"] = "Cluster No. {0}".format(target)
filterd_df.loc[(filterd_df["geo_cluster"].isin(target_neighbors)),"filter_cluster"] = "In Network"

cmap = ["0", "1", "2"]
color= filterd_df['filter_cluster'].astype(str)
In [49]:
labels = filterd_df["filter_cluster"].unique()
In [50]:
labels
Out[50]:
array(['In Network', 'Not in Network', 'Cluster No. 50'], dtype=object)
In [51]:
print """
            We have just created a dictionary that has a list of neighboring clusters for
            each cluster. 
            
            - We should see a ring surrounding the target cluster below if we choose a 
                color mapping with three colors and assign them to:
                1. the target cluster
                2. the neighbors in the target cluster
                3. all clusters that are not neighbors

"""

cf.set_config_file(offline=True, world_readable=True, theme='pearl')

df = filterd_df[\
                (filterd_df["longitude"].between(-145, -60))\
               &(filterd_df["latitude"].between(20, 55))]\
                [["latitude","longitude","filter_cluster"]]
fig = {
    'data': [
        {
            'x': df[df['filter_cluster']==labels[1]]['longitude'],
            'y': df[df['filter_cluster']==labels[1]]['latitude'],
            'name': labels[1], 'mode': 'markers',
            "line":dict( color=bupu[3])
        },
        {
            'x': df[df['filter_cluster']==labels[0]]['longitude'],
            'y': df[df['filter_cluster']==labels[0]]['latitude'],
            'name': labels[0], 'mode': 'markers',
            "line":dict( color=bupu[5])
        },
        {
            'x': df[df['filter_cluster']==labels[2]]['longitude'],
            'y': df[df['filter_cluster']==labels[2]]['latitude'],
            'name': labels[2], 'mode': 'markers',
            "line":dict( color=bupu[7])
        }
    ],
    'layout': {
        "title": "Sensors in the US by Geospatial Cluster: Emphasis on Cluster No. {0}".format(target)
        , 'xaxis': {'title': 'Longitude'}
        , 'yaxis': {'title': "Latitude"}
    }
}

plotly.offline.iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
            We have just created a dictionary that has a list of neighboring clusters for
            each cluster. 
            
            - We should see a ring surrounding the target cluster below if we choose a 
                color mapping with three colors and assign them to:
                1. the target cluster
                2. the neighbors in the target cluster
                3. all clusters that are not neighbors


Locations and Dups

In [52]:
print "there are dups, even when controlling for location, date, and metric"

location_reading_ct = us_data_pm25_df[us_data_pm25_df['parameter']=="pm25"][["location","city"]].groupby("location")\
                                                            .count()\
                                                            .sort_values(by="city", ascending=False)
        
displayHTML( location_reading_ct.head() )
            
            
print """
        some regions have readings for multiple times in the day.
        we will need to normalize for this because rush hour will
        probably elevate readings.
        """
displayHTML( us_data_pm25_df[(us_data_pm25_df['location']=="NCORE")&(us_data_pm25_df['parameter']=="pm25")].head(3) )
there are dups, even when controlling for location, date, and metric
city
location
Lebanon 3331
NCORE 2965
Lewiston 2262
Near Road 1878
Columbia 1298
        some regions have readings for multiple times in the day.
        we will need to normalize for this because rush hour will
        probably elevate readings.
        
location city country local parameter value latitude longitude local_datetime local_hour local_day
2935 NCORE LEWIS AND CLARK US 2018-02-04T11:00:00-07:00 pm25 4.2 46.850500 -111.987164 2018-02-04 18:00:00 18 4
3072 NCORE BROWARD US 2018-02-04T13:00:00-05:00 pm25 9.1 26.054047 -80.257608 2018-02-04 18:00:00 18 4
3097 NCORE Omaha-Council Bluffs US 2018-02-04T12:00:00-06:00 pm25 6.0 41.247400 -95.973100 2018-02-04 18:00:00 18 4

Fit Kernel Density Estimation

In [53]:
xn = location_reading_ct["city"]

gkde=stats.gaussian_kde(xn)

ind = np.linspace(0,3000,100)
kdepdf = gkde.evaluate(ind)

kdedf = pd.DataFrame(kdepdf)
In [54]:
print """
        Below is a distribution of samples obtained by each sensor location (measuring pm25) over the entire
        window of observation. There are a few interesting takeaways here:
        
            1. the distribution is not uniform, which may be due to sensors being added over time and 
                therefore having less readings. This means that:
                
                - we need to be careful when modeling the time series
                - the matrix operations will necessitate equal column vector lengths, so we should be thoughtful
                    about imputation
                    
            2. the multimodal nature of the distribution suggests a couple of possibilities:
                - a single location might have multiple readings, which when modelled, could lead to
                    variance inflation or biased results, deppending on if or which parametric approach
                    we apply

"""

plotly.offline.iplot({
"data": [{
    "x": ind,
    "y": kdepdf
        , 'fill':'tozeroy'
}],
"layout": {
    "title": "Distribution of Samples by Sensor"
    , "xaxis": {"title":"Count of Samples"}
    , "yaxis": {"title":"p(X=x)"}

}
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
        Below is a distribution of samples obtained by each sensor location (measuring pm25) over the entire
        window of observation. There are a few interesting takeaways here:
        
            1. the distribution is not uniform, which may be due to sensors being added over time and 
                therefore having less readings. This means that:
                
                - we need to be careful when modeling the time series
                - the matrix operations will necessitate equal column vector lengths, so we should be thoughtful
                    about imputation
                    
            2. the multimodal nature of the distribution suggests a couple of possibilities:
                - a single location might have multiple readings, which when modelled, could lead to
                    variance inflation or biased results, deppending on if or which parametric approach
                    we apply


Treat the Time Series

Get a grouped mean for all sensors in a cluster, for each time period

- we'll do this for one time reading, so we can control for variance related to the time of day
In [55]:
# Restrict to the 18:00 local-hour reading (holds time-of-day variance
# constant), then average all sensor values within each
# (geo_cluster, local_datetime) pair.
# BUG FIX: the original chain ended with `.reset_index()\` — a dangling
# line continuation into a blank line, which is a SyntaxError on a fresh
# Restart & Run All.
grouped_ts_df = working_df[working_df["local_hour"]==18]\
                            [["geo_cluster","location","local_datetime","value"]]\
                                .groupby(["geo_cluster", "local_datetime"])\
                                .mean()\
                                .reset_index()
Unstack the time series, so that each cluster has its own column vector

In [56]:
# One row per (cluster, datetime) pair; copy so the grouped frame is left
# untouched by later mutations.
df = (grouped_ts_df[["geo_cluster", "local_datetime", "value"]]
          .drop_duplicates()
          .copy())

# Coerce the date column to datetime64 so it can serve as a DatetimeIndex.
df["local_datetime"] = pd.to_datetime(df["local_datetime"])
In [57]:
# Index by date so pd.Grouper below can resample on the index.
df = df.set_index('local_datetime')

# Resample each cluster to daily frequency, then pivot so every
# geo_cluster becomes its own column vector.
daily_grouper = df.groupby([pd.Grouper(freq='1D'), "geo_cluster"])
unstacked_df = daily_grouper['value'].sum().unstack('geo_cluster')

Before Interpolating, Calculate Record Persistence

  • Min date
  • Max date
  • Count of Nulls
  • Months diff
In [58]:
print """
         The time series for the cluster below illustrates something that we need 
         to be mindful of: 
         
         - The measurements do not persist throughout the time series,
         so if we thoughtlessly interpolated the null values, LOWESS will output
         singular matrix errors.
"""

problem_cluster = 13

plotly.offline.iplot({
"data": [
    {
      "x": unstacked_df.index
    , "y": unstacked_df[problem_cluster]
    ,"name": "Cluster No. {0}".format(target)
    , "line":dict( color='tozeroy')
    }


],
"layout": {
      "title": "PM2.5 Measurements for Cluster No. {0} (Daily)".format( problem_cluster )

    , "yaxis": {"title":"µg/m3"}
    , "legend": dict(orientation="h")
}
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
         The time series for the cluster below illustrates something that we need 
         to be mindful of: 
         
         - The measurements do not persist throughout the time series,
         so if we thoughtlessly interpolated the null values, LOWESS will output
         singular matrix errors.

In [59]:
#####################################################################################
################################ Test ###############################################
#####################################################################################
keep_list, drop_list, window_null_rates = non_persistent_ts( unstacked_df )
#####################################################################################
################################ Test ###############################################
#####################################################################################
print drop_list
print keep_list

displayHTML(window_null_rates[window_null_rates[2016]>.2])
displayHTML(window_null_rates[window_null_rates[2017]>.2])
displayHTML(window_null_rates[window_null_rates[2018]>.2])
[6, 9, 13, 30, 52]
[0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]
year geo_cluster 2016 2017 2018
6 6 0.495677 0.000000 0.000000
9 9 0.230548 0.000000 0.000000
13 13 0.608069 0.339161 0.621622
30 30 0.216138 0.003497 0.000000
52 52 0.570605 0.000000 0.000000
year geo_cluster 2016 2017 2018
13 13 0.608069 0.339161 0.621622
year geo_cluster 2016 2017 2018
13 13 0.608069 0.339161 0.621622

Drop bad columns and interpolate between null values with cubic splines

In [60]:
# --- Test: drop the non-persistent clusters and spline-interpolate the rest ---
unstacked_df = treat_ts( unstacked_df, keep_list )
# ------------------------------------------------------------------------------
In [61]:
# Sparse start of the window: early dates are mostly NaN.
print """ 
            Now all clusters have the same time series, even if there were no readings, and this
            is apparent because there are null observations if we print the beginning of the 
            observation window:
"""
displayHTML( unstacked_df.head(3) )

# Dense end of the window: recent dates have readings for every cluster.
print """ 
            In more recent readings, the data are well populated:
"""

displayHTML( unstacked_df.tail(3) )
 
            Now all clusters have the same time series, even if there were no readings, and this
            is apparent because there are null observations if we print the beginning of the 
            observation window:

geo_cluster 0 1 2 3 4 5 7 8 10 11 12 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
local_datetime
2016-01-04 NaN NaN NaN NaN NaN NaN NaN 18.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2016-01-05 NaN NaN NaN NaN NaN NaN NaN 11.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2016-01-12 NaN NaN NaN NaN NaN NaN NaN 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
 
            In more recent readings, the data are well populated:

geo_cluster 0 1 2 3 4 5 7 8 10 11 12 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
local_datetime
2018-03-27 6.118182 3.500000 2.625 3.483333 2.426087 9.25 6.25 15.533333 6.200000 1.972727 -0.200000 6.865217 2.458333 1.1 10.500000 7.477778 8.662500 3.718750 3.70 5.742105 1.20 2.5200 6.116667 2.3200 4.22 2.671429 6.550000 13.772727 10.210000 6.250000 2.257143 6.485714 6.586667 3.666667 9.314286 8.300000 12.125000 4.043750 2.000000 5.248000 1.642857 -0.066667 3.928571 5.285714 7.815789 22.50 4.400 2.1000 2.571429 6.7625 5.281818 5.000000 11.98 3.750000 6.352632 7.033333 7.150000 10.8 2.500000 7.150000 4.833333 8.10 2.7 14.25 3.375 7.600049 3.504191
2018-03-29 2.933333 5.571429 6.700 5.120000 4.325000 3.20 5.00 6.628571 14.233333 3.815385 2.833333 7.936364 4.116667 2.0 4.666667 12.544444 4.011111 9.017647 6.26 14.772222 3.03 8.0375 6.461111 3.7750 7.17 4.662500 10.463636 7.283333 3.809091 10.827273 4.033333 10.657143 11.426667 6.142857 4.120000 6.000000 5.109091 17.238889 2.481818 12.378261 3.800000 5.233333 4.207143 3.500000 3.475000 7.40 11.250 3.2875 1.571429 8.7000 7.863636 0.666667 5.94 5.062500 17.305556 14.200000 7.685714 10.9 6.633333 1.966667 9.000000 7.60 1.7 4.45 2.725 6.700000 5.633333
2018-03-31 2.800000 10.142857 8.300 4.480000 2.838095 4.25 2.00 5.650000 2.766667 2.133333 1.000000 6.222727 4.160000 5.7 3.700000 3.247059 5.362500 10.182353 3.16 5.542857 2.85 8.4625 3.372222 4.5125 6.18 6.480000 3.731579 7.909091 3.783333 3.600000 4.250000 9.057143 7.966667 4.666667 5.866667 7.021429 4.936364 22.464706 2.663636 3.337500 3.835714 1.100000 5.142857 8.271429 3.180000 6.55 8.375 4.5625 1.428571 6.7375 4.827273 1.833333 8.10 7.642857 3.638889 8.075000 2.387500 8.7 4.800000 1.200000 7.042857 2.18 0.2 4.90 2.300 1.500000 2.500000

Pick a target and neighbors to test

In [62]:
# pick a single cluster
target = 50
# target = 29
# target = 48

# fetch the neighbors of that cluster 
target_neighbors = neighbor_dict[target]
target_with_neighbors = list(set([target]).union(set(target_neighbors)))


print "The target is cluster number: {0}".format(target)
print "The target's nearest neighbors are: {0}".format(target_neighbors)
print "The target and it's neighbors are: {0}".format(target_with_neighbors)
The target is cluster number: 50
The target's nearest neighbors are: [1, 16, 20, 24, 57, 58, 62]
The target and it's neighbors are: [1, 16, 50, 20, 24, 57, 58, 62]

Plot results of the unstacker

In [63]:
# Overlay the target cluster's daily series with the mean of its neighbours.
plotly.offline.iplot({
"data": [
    {
      "x": unstacked_df.index
    , "y": unstacked_df[target]
    ,"name": "Cluster No. {0}".format(target)
    # BUG FIX: 'tozeroy' is a fill mode, not a valid line colour; it was
    # being passed as line.color. Use it as the trace's fill instead.
    , "fill": 'tozeroy'
    }
    ,     {
      "x": unstacked_df.index
    , "y": unstacked_df[target_neighbors].mean(axis=1)
    ,"name": "Mean of clusters: {0}".format(target_neighbors)
    , "line":dict( color='rgb(0,0,0)')
    }

],
"layout": {
      "title": "PM2.5 Measurements for Cluster No. {0} (Daily)".format(target)

    , "yaxis": {"title":"µg/m3"}
    , "legend": dict(orientation="h")
}
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)

Remove a Locally Smoothed Version of the Series

  • we are backing the trend out to enforce stationarity
  • LOWESS was chosen because it is easily interpreted and it does not have to conform to the same rules as OLS, so it will be more performant
In [64]:
#####################################################################################
################################ Test ###############################################
#####################################################################################
# Smooth every cluster column; `smoother` is defined earlier in the notebook.
smoother_df = smoother( unstacked_df )
#####################################################################################
################################ Test ###############################################
#####################################################################################

Option 1: EWMA

  • We will compute this forward and backward, then average the two: ##### $ \Large{y_t = \frac{\sum\limits_{i=0}^t w_i x_{t-i}}{\sum\limits_{i=0}^t w_i}} $

Option 2: Hodrick-Prescott Filter

  • The bandpass will be decomposed into trend and seasonality: ##### $\Large{y_t = x_t + c_t}$
  • Here is the cost function that we want to minimize: #### $\Large{\min_{\\{ x_{t}\\} }\sum_{t}^{T} c_{t}^{2}+\lambda\displaystyle\sum_{t=1}^{T}\left[\left(x_{t}- x_{t-1}\right)-\left(x_{t-1}-x_{t-2}\right)\right]^{2}}$

Option 3: LOWESS

The tricube function:
  • We obtain weights, running OLS locally #### $\Large{w_i = {\left [ 1 - {\left |\frac{(x-x_i)}{d(x)}\right |} ^{\small{3}}\right ]}^{\small{3}}}$
The model:

$\Large{t_i(x_k) = w_i \frac{(x_i-x_k)}{d_i}}$

Model I: Hodrick Prescott Filter

In [65]:
# Dependent series for the target cluster with missing days removed.
# BUG FIX: the original also assigned `x = range(len(unstacked_df.index)+1)`
# (one element too long), but that binding was never read before being
# reassigned in a later cell, so the dead statement is dropped.
y = unstacked_df[target].dropna()
In [66]:
# Hodrick-Prescott decomposition of the target series into cycle + trend.
lamb = 129600 # parameter for 12m smoothing
cycle_annual, trend_annual = sm.tsa.filters.hpfilter(y, lamb=lamb)

# Collect observed, cycle, and trend (renamed "HP FILTER") in one frame.
hp_decompose_df = pd.DataFrame(y)
hp_decompose_df['cycle'] = cycle_annual
hp_decompose_df['trend'] = trend_annual
hp_decompose_df = (hp_decompose_df
                       .rename(columns={"trend": "HP FILTER"})
                       .reset_index())

Model II: Exponential Smoothing (forwards and backwards)

In [67]:
ewma = pd.Series.ewm

# Series restricted to dates where the target actually has a reading.
ewma_df = unstacked_df[unstacked_df[target].notnull()][target].reset_index()
x = ewma_df.index
y = ewma_df[target]

df = pd.Series(y)

# Smooth forward and backward, then average the two passes so the filter
# is symmetric (no phase lag from a single-direction EWMA).
fwd = ewma(df, span=10).mean()
bwd = ewma(df[::-1], span=10).mean()
filtered = np.mean(np.vstack((fwd, bwd[::-1])), axis=0)
In [68]:
# Replace the raw readings with the symmetric EWMA estimate; only the
# date column and "EWMA" remain for the merge below.
ewma_df["EWMA"] = filtered
ewma_df = ewma_df.drop(target, axis=1)

Model III: LOWESS

In [69]:
# Fit LOWESS on the non-null portion of the target series.
lowess = sm.nonparametric.lowess
sm_lowess_df = unstacked_df[target].dropna()
# sm_lowess_df = unstacked_df[target].fillna(0)

x=sm_lowess_df.index
y=sm_lowess_df

# frac=1/3: each local regression uses a third of the data.
w = lowess(y, x, frac=1./3)
In [70]:
# `w` is the (x, fitted) array returned by lowess; column 1 holds the
# smoothed values. Slice it directly instead of the original
# `map(lambda x: x[0], w[:, 1:])`, which built the same values one scalar
# at a time (and left a stray module-level `f` behind).
sm_lowess_df = pd.DataFrame(w[:, 1], columns=["LOWESS"])
sm_lowess_df["local_datetime"] = x
In [71]:
# Assemble the observed series plus all three smoothers on one date axis.
merge_models_df = (unstacked_df[target]
                       .reset_index()
                       .rename(columns={target: "EVIDENCE"})
                       .merge(hp_decompose_df, on="local_datetime", how="left")
                       .merge(ewma_df, on="local_datetime", how="left")
                       .merge(sm_lowess_df, on="local_datetime", how="left"))
In [ ]:
 
In [72]:
# Re-seat EVIDENCE positionally from the source frame, then keep a
# fully-observed view for plotting.
merge_models_df["EVIDENCE"] = unstacked_df[target].values
plot_df=merge_models_df.dropna()
In [73]:
merge_models_df.head()
Out[73]:
local_datetime EVIDENCE 50 cycle HP FILTER EWMA LOWESS
0 2016-01-04 NaN NaN NaN NaN NaN NaN
1 2016-01-05 NaN NaN NaN NaN NaN NaN
2 2016-01-12 NaN NaN NaN NaN NaN NaN
3 2016-01-13 NaN NaN NaN NaN NaN NaN
4 2016-01-14 NaN NaN NaN NaN NaN NaN
In [74]:
# Build an "Evidence" copy in which every model column is overwritten with
# the raw observations.
# NOTE(review): plot_df does not appear to be read by the following plot
# cell, which uses merge_models_df directly — confirm whether this cell is
# a stale draft.
merge_models_df["CATEGORY"] = "Model"
plot_df = merge_models_df.copy()

plot_df["EVIDENCE"] = merge_models_df["EVIDENCE"]
plot_df["HP FILTER"] = merge_models_df["EVIDENCE"]
plot_df["EWMA"] = merge_models_df["EVIDENCE"]
plot_df["LOWESS"] = merge_models_df["EVIDENCE"]
plot_df["CATEGORY"] = "Evidence"
In [75]:
# One marker trace for the raw observations plus one line per smoother,
# shaded from the `bupu` colour scale (darker = stiffer smoother).
obs_trace = {
      "x": merge_models_df.index
    , "y": merge_models_df["EVIDENCE"]
    , "name": "Observations"
    , 'mode': 'markers'
    , "line": dict(color=bupu[2], width=1, shape="cross")
}

model_traces = [
    {
          "x": merge_models_df.index
        , "y": merge_models_df[col]
        , "name": col
        , "line": dict(color=bupu[shade], width=3)
    }
    for col, shade in [("HP FILTER", 8), ("EWMA", 6), ("LOWESS", 4)]
]

plotly.offline.iplot({
    "data": [obs_trace] + model_traces,
    "layout": {
          "title": "PM2.5 Smoothing Parameters for Cluster No. {0} (Daily)".format(target)
        , "yaxis": {"title": "µg/m3"}
        , "legend": dict(orientation="h")
    }
}, config={"displaylogo": False, 'displayModeBar': False}, show_link=False)

Residuals

In [76]:
# Residual = observed minus each smoother's fitted value, one column per
# smoother.
compare_resids_df = merge_models_df[["LOWESS", "EWMA", "HP FILTER"]].apply(
    lambda fitted: merge_models_df["EVIDENCE"] - fitted)

compare_resids_df.columns = ["{0} RESIDUALS".format(c)
                             for c in compare_resids_df.columns]
compare_resids_df = compare_resids_df.set_index(unstacked_df.index)
In [77]:
# Square the residuals; stringify the dates for cufflinks' categorical x axis.
compare_resids_df = compare_resids_df**2

df = compare_resids_df.reset_index()
df["local_datetime"] = df["local_datetime"].astype(str)
In [78]:
# Three stacked panels, one per smoother's squared residuals.
# BUG FIX: the subplot titles previously called .format(target) /
# .format(target_neighbors[...]) on strings with no {} placeholders — the
# arguments were silently ignored, so the dead calls are removed.
resid_cols = ["LOWESS RESIDUALS", "EWMA RESIDUALS", "HP FILTER RESIDUALS"]
chart_data = cf.subplots(
    [df.figure(x='local_datetime', y=col, colors=[bupu[5]])
     for col in resid_cols],
    shape=(3, 1),
    theme="white",
    subplot_titles=(
          "LOWESS SQUARED RESIDUALS"
        , "EWMA SQUARED RESIDUALS"
        , "HP FILTER SQUARED RESIDUALS"
    ))


chart_data['layout'].update(showlegend=False, plot_bgcolor='#FFFFFF', paper_bgcolor='#FFFFFF')

# Render the residual panels as lines rather than the default markers.
for i in range(0,3):
    chart_data['data'][i].update(mode="lines")

iplot(chart_data, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)

Cluster the Squared Residuals with a Density Based Method

In [79]:
def cluster_residuals( X ):
    """Label squared-residual observations with DBSCAN.

    Points that DBSCAN cannot assign to any density cluster receive the
    label -1, which downstream cells treat as the outlier flag.

    X : array-like of shape (n_samples, n_features), passed straight to
        DBSCAN.fit (the callers reshape a single column to (-1, 1)).

    Returns the integer label array ``db.labels_``, one entry per row of X.
    """
    db = DBSCAN(eps=.9,
                min_samples=4,
                leaf_size=30,
                metric="euclidean").fit(X)
    # The original also built a boolean core-sample mask here, but it was
    # never read; the dead computation is removed.
    return db.labels_
In [80]:
# function to map the db scan over one column of squared residuals
f = lambda x: cluster_residuals( np.array(x.fillna(0)).reshape(-1, 1) )

# DBSCAN label -1 (noise) -> 1, every other label -> NaN, so multiplying
# by the indicator masks out non-outliers.
flagged_df = (compare_resids_df.apply(f, axis=0)
                  .apply(lambda labels: (labels == -1) * 1)
                  .replace(0, np.nan))

# Keep the observed target value wherever an outlier was flagged.
outliers_ts = flagged_df.apply(lambda ind: ind * unstacked_df[target], axis=0)
In [81]:
# Baseline frame: the raw target series repeated under each residual-column
# name so it aligns with the flagged-outlier frame in the next cell.
observed_plot_df = pd.DataFrame(unstacked_df.index)
for resid_col in ("EWMA RESIDUALS", "HP FILTER RESIDUALS", "LOWESS RESIDUALS"):
    observed_plot_df[resid_col] = unstacked_df[target].values
observed_plot_df["Y PRED"] = 0
In [82]:
# Stack observed rows (Y PRED=0) with flagged-outlier rows (Y PRED=1);
# string dates give cufflinks a categorical x axis.
outliers_plot_df = outliers_ts.copy().reset_index()
outliers_plot_df["Y PRED"]=1

ts_cat_df = pd.concat([observed_plot_df, outliers_plot_df], axis=0)
ts_cat_df["local_datetime"]=ts_cat_df["local_datetime"].astype(str)

Plot Model Results

In [83]:
# One subplot per smoothing method; each plots observations (black) with
# the DBSCAN-flagged outliers overlaid in red via the Y PRED category.
# NOTE(review): shape=(2,2) with three figures leaves one panel empty.
df =ts_cat_df

chart_data = cf.subplots(\
                         [df.figure(\
                          kind='scatter'
                        , mode='markers'
                        , x='local_datetime'
                        , y="EWMA RESIDUALS"
                        , categories="Y PRED"
                        , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
                      ),
             df.figure(\
                          kind='scatter'
                        , mode='markers'
                        , x='local_datetime'
                        , y="HP FILTER RESIDUALS"
                        , categories="Y PRED"
                        , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
                       ),\
             df.figure(\
                          kind='scatter'
                        , mode='markers'
                        , x='local_datetime'
                        , y="LOWESS RESIDUALS"
                        , categories="Y PRED"
                        , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
                       )
            ],shape=(2,2), theme="white",subplot_titles=(
                                                              "EWMA + dbScan"
                                                            , "HP FILTER + dbScan"
                                                            , "LOWESS + dbScan"
))


chart_data['layout'].update(showlegend=False, plot_bgcolor='#FFFFFF', paper_bgcolor='#FFFFFF')

# 3 subplots x 2 categories = 6 traces; shrink all markers.
for i in range(0,6):
    chart_data['data'][i]['marker'].update(size=5)
    
iplot(chart_data, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)

Model Implementation

In [84]:
def apply_lowess(x, frac=1./3):
    """LOWESS-smooth one cluster column and re-align it to the full calendar.

    x    : pandas.Series indexed by local_datetime. Nulls are dropped
           before fitting; the fitted values are then left-joined back onto
           the complete unstacked_df date index, so dates with no reading
           come back as NaN.
    frac : fraction of the data used for each local regression
           (default 1/3, matching the exploratory fit above; now a
           parameter instead of a hard-coded constant).

    Returns a pandas.Series of smoothed values aligned to unstacked_df.index.

    NOTE: reads the module-level ``unstacked_df`` for the full date index.
    """
    lowess = sm.nonparametric.lowess

    # Fit only on observed dates (avoids the original's shadowing of the
    # parameter `x` by its own reassignment).
    observed = x.dropna()

    # Full calendar to re-align against after fitting.
    dates_df = pd.DataFrame(unstacked_df.index)

    fitted = lowess(observed, observed.index, frac=frac)

    # Column 1 of the lowess output holds the smoothed values; slice it
    # directly instead of map(lambda v: v[0], fitted[:, 1:]).
    output_df = pd.DataFrame(fitted[:, 1], columns=["LOWESS"])
    output_df["local_datetime"] = observed.index

    merged_df = dates_df.merge(output_df, on="local_datetime", how="left")

    return merged_df["LOWESS"]
In [85]:
# Smooth every cluster column, then compute squared residuals against the
# observed values.
smoother_applied_df = (unstacked_df.apply(apply_lowess)
                           .set_index(unstacked_df.index))
calc_resids_applied_df = unstacked_df - smoother_applied_df
calc_sq_resids_applied_df = calc_resids_applied_df**2
In [86]:
# function to map the db scan (same flagging rule as the earlier test cell)
f = lambda x: cluster_residuals( np.array(x.fillna(0)).reshape(-1, 1) )

# DBSCAN label -1 (noise) -> 1, every other label -> NaN.
flagged_df = (calc_sq_resids_applied_df.apply(f, axis=0)
                  .apply(lambda labels: (labels == -1) * 1)
                  .replace(0, np.nan))

# Element-wise mask: observed values survive only where flagged as outliers.
outliers_ts = flagged_df * unstacked_df

Y Pred vs Observations

In [ ]:
# Reverse-geocode each cluster midpoint to a "City, State" label.
# BUG FIX: the original issued TWO separate search.by_coordinate lookups
# per row (one for City, one for State); a single lookup per row returns
# both fields and halves the expensive calls.
def _nearest_place(row):
    # row[0]=latitude, row[1]=longitude; take the closest match within 30mi.
    place = search.by_coordinate(row[0], row[1], radius=30, returns=1)[0]
    return pd.Series({"City": place.City, "State": place.State})

midpoints_df[["City", "State"]] = \
    midpoints_df[["latitude", "longitude"]].apply(_nearest_place, axis=1)

midpoints_df["City_State"] = midpoints_df["City"]+", "+midpoints_df["State"]

# Per-cluster lookup table keyed by geo_cluster.
cities_dict = midpoints_df.set_index("geo_cluster").to_dict("index")

str(cities_dict[50]["City_State"])
In [87]:
# outliers_ts

# Flagged outliers (Y PRED=1) stacked on top of all observed cluster series
# (Y PRED=0); string dates for cufflinks' categorical x axis.
outliers_plot_df = outliers_ts.copy().reset_index()
outliers_plot_df["Y PRED"]=1

observed_plot_df = unstacked_df.copy().reset_index()
observed_plot_df["Y PRED"]=0

ts_cat_df = pd.concat([observed_plot_df, outliers_plot_df], axis=0)
ts_cat_df["local_datetime"]=ts_cat_df["local_datetime"].astype(str)
In [144]:
# 2x2 grid: target cluster plus its first three neighbours, observations in
# black with DBSCAN-flagged outliers in red (Y PRED category).
df =ts_cat_df

chart_data = cf.subplots(\
                         [df.figure(\
                          kind='scatter'
                        , mode='markers'
                        , x='local_datetime'
                        , y=target
                        , categories="Y PRED"
                        , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
                      ),
             df.figure(\
                          kind='scatter'
                        , mode='markers'
                        , x='local_datetime'
                        , y=target_neighbors[0]
                        , categories="Y PRED"
                        , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
                       ),\
             df.figure(\
                          kind='scatter'
                        , mode='markers'
                        , x='local_datetime'
                        , y=target_neighbors[1]
                        , categories="Y PRED"
                        , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
                       ),\
             df.figure(\
                          kind='scatter'
                        , mode='markers'
                        , x='local_datetime'
                        , y=target_neighbors[2]
                        , categories="Y PRED"
                        , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
                       )
            ],shape=(2,2), theme="white",subplot_titles=(
#                                                               "Cluster No. {0}".format(target)
#                                                             , "Cluster No. {0}".format(target_neighbors[0])
#                                                             , "Cluster No. {0}".format(target_neighbors[1])
#                                                             , "Cluster No. {0}".format(target_neighbors[2])
                                                              "Cluster No. {0}".format(str(cities_dict[target]["City_State"]))
                                                            , "Cluster No. {0}".format(str(cities_dict[target_neighbors[0]]["City_State"]))
                                                            , "Cluster No. {0}".format(str(cities_dict[target_neighbors[1]]["City_State"]))
                                                            , "Cluster No. {0}".format(str(cities_dict[target_neighbors[2]]["City_State"]))
))


chart_data['layout'].update(showlegend=False, plot_bgcolor='#FFFFFF', paper_bgcolor='#FFFFFF')

# 4 subplots x 2 categories = 8 traces; shrink all markers.
for i in range(0,8):
    chart_data['data'][i]['marker'].update(size=5)
    
iplot(chart_data, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
In [89]:
# NOTE(review): this lag construction is immediately overwritten by the
# cells below (In[90]-In[93]) — looks like a superseded draft; confirm and
# consider deleting.
cpt_prep_df = flagged_df[target_with_neighbors]
cpt_prep_df=pd.concat([flagged_df[target][1:],cpt_prep_df[target_neighbors].shift(1)],axis=1)
In [90]:
cpt_prep_df = flagged_df[target_with_neighbors].copy()
In [91]:
# NOTE(review): renames the columns of the frame built above, but In[92]
# rebuilds cpt_prep_df, so this rename has no lasting effect — confirm.
new_names = [str(i)+"_lag_1d" for i in cpt_prep_df.columns.values]
cpt_prep_df.columns=new_names
In [92]:
# Current-day outlier flags for the target and its neighbours, plus a
# one-day-shifted copy whose columns carry a "_lag_1d" suffix.
cpt_prep_df = flagged_df[target_with_neighbors]
cpt_prep_lag_df = cpt_prep_df.shift(1).copy()

cpt_prep_lag_df.columns = ["{0}_lag_1d".format(c)
                           for c in cpt_prep_df.columns.values]
In [93]:
cpt_prep_df=pd.concat([cpt_prep_df[1:],cpt_prep_lag_df.shift(1)],axis=1)
In [94]:
# Count current-day outlier flags conditioned on the lagged flags of the
# target (50) and each neighbour — the raw counts behind the CPT below.
cpt_counts = cpt_prep_df.fillna(0).groupby(['1_lag_1d', '16_lag_1d',
       '50_lag_1d', '20_lag_1d', '24_lag_1d', '57_lag_1d', '58_lag_1d',
       '62_lag_1d']).sum()#.reset_index()
  • TODO: try this with a resample on the months too
In [95]:
cpt_df = cpt_counts/(len(cpt_prep_df)*1.00)
In [96]:
cpt_df
Out[96]:
1 16 50 20 24 57 58 62
1_lag_1d 16_lag_1d 50_lag_1d 20_lag_1d 24_lag_1d 57_lag_1d 58_lag_1d 62_lag_1d
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.035821 0.050746 0.028358 0.032836 0.013433 0.032836 0.025373 0.044776
1.0 0.007463 0.007463 0.004478 0.001493 0.001493 0.004478 0.005970 0.004478
1.0 0.0 0.000000 0.001493 0.000000 0.000000 0.001493 0.000000 0.000000 0.000000
1.0 0.001493 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.005970 0.002985 0.004478 0.002985 0.004478 0.011940 0.000000 0.001493
1.0 0.000000 0.000000 0.000000 0.000000 0.002985 0.001493 0.000000 0.001493
1.0 1.0 0.000000 0.000000 0.001493 0.001493 0.001493 0.000000 0.001493 0.000000
1.0 0.0 0.0 0.0 0.001493 0.001493 0.002985 0.002985 0.000000 0.001493 0.001493 0.001493
1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.000000 0.000000 0.001493 0.002985 0.000000 0.002985 0.002985 0.000000
1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.001493 0.000000 0.000000
1.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.002985 0.002985 0.001493 0.001493 0.001493 0.004478
1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.001493 0.000000 0.001493
1.0 0.0 0.001493 0.000000 0.001493 0.001493 0.000000 0.000000 0.001493 0.001493
1.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.000000 0.001493 0.000000 0.000000 0.000000 0.001493 0.000000 0.000000
1.0 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.001493 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.0 0.0 0.0 0.002985 0.004478 0.004478 0.002985 0.000000 0.001493 0.002985 0.001493
1.0 0.000000 0.000000 0.001493 0.000000 0.000000 0.001493 0.000000 0.001493
1.0 0.0 1.0 0.000000 0.001493 0.001493 0.001493 0.001493 0.002985 0.002985 0.002985
1.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 1.0 0.0 0.0 0.001493 0.000000 0.001493 0.001493 0.000000 0.001493 0.001493 0.001493
1.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.001493 0.000000 0.001493 0.000000 0.000000
1.0 0.000000 0.000000 0.001493 0.001493 0.001493 0.001493 0.001493 0.001493
1.0 0.0 0.000000 0.000000 0.000000 0.001493 0.000000 0.000000 0.000000 0.000000
1.0 0.000000 0.000000 0.000000 0.000000 0.001493 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.000000 0.000000 0.001493 0.001493 0.001493 0.001493 0.000000 0.001493
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.001493 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 1.0 0.000000 0.001493 0.000000 0.000000 0.001493 0.001493 0.001493 0.000000
1.0 0.0 0.0 0.0 0.0 0.000000 0.002985 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 1.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.010448 0.008955 0.004478 0.002985 0.001493 0.002985 0.001493 0.004478
1.0 0.000000 0.000000 0.001493 0.000000 0.000000 0.001493 0.000000 0.002985
1.0 0.0 0.000000 0.004478 0.001493 0.001493 0.000000 0.000000 0.000000 0.001493
1.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.0 1.0 0.001493 0.001493 0.000000 0.000000 0.000000 0.001493 0.000000 0.001493
1.0 0.0 0.000000 0.001493 0.000000 0.001493 0.000000 0.000000 0.000000 0.000000
1.0 1.0 1.0 0.001493 0.000000 0.001493 0.001493 0.001493 0.001493 0.001493 0.001493
1.0 0.0 0.0 0.0 0.0 0.0 0.001493 0.000000 0.001493 0.001493 0.000000 0.001493 0.000000 0.001493
1.0 0.001493 0.000000 0.001493 0.001493 0.000000 0.001493 0.001493 0.001493
1.0 0.0 0.001493 0.000000 0.001493 0.000000 0.000000 0.001493 0.002985 0.000000
1.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.001493 0.000000 0.000000
1.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.002985 0.000000 0.000000 0.000000 0.001493 0.001493 0.000000 0.002985
1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.000000 0.000000 0.001493 0.001493 0.000000 0.001493 0.000000 0.000000
1.0 1.0 1.0 1.0 0.001493 0.000000 0.001493 0.001493 0.000000 0.000000 0.001493 0.000000
1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.001493 0.000000 0.001493 0.000000 0.000000 0.000000 0.000000 0.001493
1.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1.0 0.0 0.0 0.001493 0.000000 0.000000 0.001493 0.000000 0.001493 0.000000 0.001493
1.0 0.000000 0.002985 0.000000 0.000000 0.000000 0.002985 0.000000 0.001493
1.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.001493 0.001493 0.000000 0.000000 0.001493 0.000000 0.001493
1.0 0.0 1.0 1.0 1.0 0.001493 0.001493 0.001493 0.001493 0.001493 0.000000 0.001493 0.001493
1.0 0.0 1.0 1.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.001493 0.001493

76 rows × 8 columns

Bayes Net

In [97]:
# Resample the flagged-outlier series to monthly counts of outlier days.
resample_m_df = outliers_ts.resample("m").count()
In [98]:
resample_m_df[[20, 24, 57, 58, 62,50]].plot()
Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ba25dd0>
In [ ]:
 
In [103]:
 
In [101]:
midpoints_df.head()
Out[101]:
geo_cluster latitude longitude
0 0 37.868930 -86.189384
1 1 39.135479 -119.946941
2 2 20.414953 -156.580203
3 3 41.442846 -77.751986
4 4 47.313045 -116.850417
In [108]:
# NOTE(review): ZipcodeSearchEngine is already imported at the top of the
# notebook — this re-import is redundant but harmless.
from uszipcode import ZipcodeSearchEngine
search = ZipcodeSearchEngine()
# Sanity check: nearest zipcode record to a test coordinate (30mi radius).
res = search.by_coordinate(39.122229, -77.133578, radius=30, returns=1)
In [ ]:
res = search.by_coordinate(39.122229, -77.133578, radius=30, returns=1)
In [122]:
res[0]
Out[122]:
{"City": "Derwood", "Density": 1090.3890160183066, "HouseOfUnits": 5018, "LandArea": 13.11, "Latitude": 39.13298029999999, "Longitude": -77.1370115, "NEBoundLatitude": 39.1858399, "NEBoundLongitude": -77.099876, "Population": 14295, "SWBoundLatitude": 39.10157279999999, "SWBoungLongitude": -77.176203, "State": "MD", "TotalWages": 502406808.0, "WaterArea": 0.25, "Wealthy": 35145.631899265485, "Zipcode": "20855", "ZipcodeType": "Standard"}
midpoints_df[["latitude", "longitude"]].apply(
    lambda x: search.by_coordinate(39.122229, -77.133578, radius=30, returns=1)[0].City,
    axis=1)
In [143]:
 
Out[143]:
'Morgan Hill, CA'
In [ ]:
 
In [140]:
 
In [ ]: